文件摘要器（PDF/TXT/MD/DOCX → 結構化摘要）

17th鐵人賽

frankfrank8785

2025-09-17 12:03:33

104 瀏覽

分享至

📦 安裝相依套件（新增）
npm i pdf-parse mammoth

🆕 程式碼

src/utils/io.js（新增）
// src/utils/io.js
import fs from "fs";
import path from "path";
import pdfParse from "pdf-parse";
import mammoth from "mammoth";

/**
 * Make sure a directory exists, creating it (and any missing parents) when absent.
 *
 * @param {string} dir - Directory path to check and, if needed, create.
 * @returns {void}
 */
export function ensureDir(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}

/**
 * Guess a file extension from a Content-Type header value (simplified).
 *
 * Matching is by substring, so parameters such as "; charset=utf-8" are
 * tolerated. The value is lower-cased first because media types are
 * case-insensitive ("Application/PDF" must still map to ".pdf").
 *
 * @param {string|null} [ct=""] - Raw Content-Type header value.
 * @returns {string} One of ".pdf", ".doc", ".docx", ".txt", ".md", or ".bin"
 *   when nothing matches.
 */
function extFromContentType(ct = "") {
  const mediaType = String(ct ?? "").toLowerCase();

  // Checked in order; the first matching needle wins.
  const rules = [
    ["pdf", ".pdf"],
    ["msword", ".doc"],
    ["officedocument.wordprocessingml.document", ".docx"],
    ["text/plain", ".txt"],
    ["markdown", ".md"],
  ];

  const match = rules.find(([needle]) => mediaType.includes(needle));
  return match ? match[1] : ".bin";
}

/**
 * Download a remote file into a local directory (relies on the global `fetch`,
 * available in Node 18+).
 *
 * The file name is `dl_<timestamp><ext>`, where the extension is derived from
 * the response's Content-Type header via `extFromContentType`.
 *
 * @param {string} url - Remote URL to fetch.
 * @param {string} [outDir="outputs/downloads"] - Directory to save into; created if missing.
 * @returns {Promise<string>} Path of the file that was written.
 * @throws {Error} When the HTTP response status is not OK.
 */
export async function downloadToTemp(url, outDir = "outputs/downloads") {
  ensureDir(outDir);

  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`下載失敗：${res.status} ${res.statusText}`);
  }

  const buf = Buffer.from(await res.arrayBuffer());
  const ct = res.headers.get("content-type") || "";
  const ext = extFromContentType(ct);
  const fp = path.join(outDir, `dl_${Date.now()}${ext}`);
  fs.writeFileSync(fp, buf);
  return fp;
}

/**
 * Read a plain-text file (.txt / .md) and return its contents decoded as UTF-8.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string} The file contents.
 */
function readTxtLike(filePath) {
  const contents = fs.readFileSync(filePath, { encoding: "utf-8" });
  return contents;
}

/**
 * Extract the text of a PDF file.
 *
 * Loads the file into a buffer, hands it to `pdfParse`, and returns the
 * `text` field of the result (or an empty string when that field is falsy).
 *
 * @param {string} filePath - Path of the PDF file.
 * @returns {Promise<string>} Extracted text, possibly empty.
 */
async function readPdf(filePath) {
  const pdfBytes = fs.readFileSync(filePath);
  const parsed = await pdfParse(pdfBytes);
  if (parsed.text) {
    return parsed.text;
  }
  return "";
}

/**
 * Extract the raw text of a DOCX file.
 *
 * Loads the file into a buffer, passes it to `mammoth.extractRawText`, and
 * returns the `value` field of the result (or an empty string when falsy).
 *
 * @param {string} filePath - Path of the DOCX file.
 * @returns {Promise<string>} Extracted text, possibly empty.
 */
async function readDocx(filePath) {
  const docxBytes = fs.readFileSync(filePath);
  const extraction = await mammoth.extractRawText({ buffer: docxBytes });
  if (extraction.value) {
    return extraction.value;
  }
  return "";
}

/**
 * Read a document's text, choosing the reader from the file extension
 * (compared case-insensitively).
 *
 * Supported: .txt and .md (read as UTF-8 text), .pdf, .docx.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The extracted text.
 * @throws {Error} When the extension is not one of the supported formats.
 */
export async function readTextFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  switch (ext) {
    case ".txt":
    case ".md":
      return readTxtLike(filePath);
    case ".pdf":
      return await readPdf(filePath);
    case ".docx":
      return await readDocx(filePath);
    default:
      throw new Error(`不支援的檔案格式：${ext}`);
  }
}

/**
 * Write a value to disk as pretty-printed JSON (2-space indent, UTF-8).
 *
 * @param {string} filePath - Destination file path.
 * @param {*} obj - Value to serialize with JSON.stringify.
 * @returns {void}
 */
export function writeJson(filePath, obj) {
  const serialized = JSON.stringify(obj, null, 2);
  fs.writeFileSync(filePath, serialized, { encoding: "utf-8" });
}
/**
 * Write a string to disk as UTF-8 text (used for the Markdown output).
 *
 * @param {string} filePath - Destination file path.
 * @param {string} text - Content to write.
 * @returns {void}
 */
export function writeText(filePath, text) {
  const writeOptions = { encoding: "utf-8" };
  fs.writeFileSync(filePath, text, writeOptions);
}

src/day11_doc_summarizer.js（新增）
// src/day11_doc_summarizer.js
import path from "path";
import { openai } from "./aiClient.js";
import { ensureDir, downloadToTemp, readTextFile, writeJson, writeText } from "./utils/io.js";

/**
 * Split text into fixed-size chunks, using character count as a rough proxy
 * for tokens. Consecutive chunks share `overlap` characters so a sentence cut
 * at a boundary still appears whole in one of them.
 *
 * The walk stops as soon as a chunk reaches the end of the text. Without that
 * stop, stepping back by `overlap` after the final chunk would keep the cursor
 * short of `text.length` forever and the loop would never terminate.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1200] - Maximum characters per chunk; positive integer.
 * @param {number} [overlap=120] - Characters shared between neighbouring chunks;
 *   integer in the range [0, chunkSize).
 * @returns {string[]} Chunks in document order; empty array for empty text.
 * @throws {RangeError} When `chunkSize` or `overlap` is out of range (either
 *   would prevent the cursor from advancing).
 */
function chunkText(text, chunkSize = 1200, overlap = 120) {
  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
    throw new RangeError(`chunkSize 必須為正整數，收到：${chunkSize}`);
  }
  if (!Number.isInteger(overlap) || overlap < 0 || overlap >= chunkSize) {
    throw new RangeError(`overlap 必須為 0 以上且小於 chunkSize 的整數，收到：${overlap}`);
  }

  const chunks = [];
  const step = chunkSize - overlap; // always >= 1 thanks to the checks above

  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(text.length, start + chunkSize);
    chunks.push(text.slice(start, end));
    // The tail has been emitted; another step would only repeat it.
    if (end === text.length) break;
  }
  return chunks;
}

/**
 * Map stage: summarize a single chunk of the document.
 *
 * Sends the chunk to the chat-completions endpoint (model "gpt-4o-mini",
 * temperature 0.3) with a system prompt asking for a bullet list plus a
 * 2-3 sentence summary with no fabricated content.
 *
 * @param {string} chunk - Text of the chunk to summarize.
 * @param {object} [opts]
 * @param {string} [opts.tone="professional"] - Tone requested in the prompt.
 * @param {string} [opts.length="medium"] - Length requested in the prompt.
 * @returns {Promise<string>} Trimmed model reply, or "" when the response has no content.
 */
async function summarizeChunk(chunk, opts) {
  const { tone = "professional", length = "medium" } = opts || {};
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.3,
    messages: [
      {
        role: "system",
        content:
          "你是嚴謹的中文技術編輯。請以重點清單 + 2~3 句摘要回覆，不可虛構內容。",
      },
      {
        role: "user",
        content: `請摘要以下內容。語氣：${tone}。長度：${length}。\n\n${chunk}`,
      },
    ],
  });
  return res.choices?.[0]?.message?.content?.trim() || "";
}

/**
 * Reduce stage: merge the per-chunk summaries into one structured summary.
 *
 * Each summary is labelled as a numbered section, then the model
 * ("gpt-4o-mini", temperature 0.3) is asked to reply with pure JSON holding
 * `tldr`, `outline`, `keyPoints`, `actionItems` and `questions`.
 *
 * The reply may be wrapped in a Markdown code fence (optionally tagged
 * `json`); the fence is stripped before parsing.
 *
 * @param {string[]} summaries - Per-chunk summaries, in document order.
 * @param {object} [opts]
 * @param {string} [opts.tone="professional"] - Tone requested in the prompt.
 * @param {string} [opts.length="medium"] - Length requested in the prompt.
 * @returns {Promise<object>} The parsed JSON reply; `{}` when the response has no content.
 * @throws {SyntaxError} When the reply is not valid JSON (original parse error attached as `cause`).
 */
async function reduceSummaries(summaries, opts) {
  const { tone = "professional", length = "medium" } = opts || {};
  const joined = summaries.map((s, i) => `# 小節${i + 1}\n${s}`).join("\n\n");

  // Written line by line because the JSON format example contains double quotes.
  const systemPrompt = [
    "你是嚴謹的中文總編輯。整合所有小節摘要，產出：",
    "1) TL;DR（3~5 句）",
    "2) Outline（6~12 條）",
    "3) KeyPoints（5~10 條）",
    "4) ActionItems（可執行清單，若無則空陣列）",
    "5) Questions（讀者可能想追問的 3~6 題）",
    '請以純 JSON 回覆，格式：{"tldr":"...","outline":[...],"keyPoints":[...],"actionItems":[...],"questions":[...]}',
    "內容不得虛構。",
  ].join("\n");

  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.3,
    messages: [
      { role: "system", content: systemPrompt },
      {
        role: "user",
        content: `語氣：${tone}，長度：${length}。\n以下為各小節摘要，請彙整：\n\n${joined}`,
      },
    ],
  });

  const raw = res.choices?.[0]?.message?.content?.trim() || "{}";
  // Unwrap an optional ``` / ```json fence around the payload.
  const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i);
  const payload = (fenced ? fenced[1] : raw).trim();

  try {
    return JSON.parse(payload);
  } catch (err) {
    throw new SyntaxError(`彙整摘要的 JSON 解析失敗：${err.message}`, { cause: err });
  }
}

/**
 * Ask the model to pick verbatim, quotable sentences from the original text.
 *
 * Only the first 16000 characters of the text are sent. The reply is expected
 * to be a JSON array and may be wrapped in a Markdown code fence (optionally
 * tagged `json`), which is stripped before parsing.
 *
 * This step is best-effort: an unparseable or non-array reply yields an empty
 * list instead of failing the whole summary.
 *
 * @param {string} original - Full original text.
 * @param {number} [limit=5] - Maximum number of quotes to request and return.
 * @returns {Promise<string[]>} Up to `limit` quotes; [] when none could be extracted.
 */
async function extractQuotes(original, limit = 5) {
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.2,
    messages: [
      {
        role: "system",
        content: `從原文中挑選最多 ${limit} 句適合直接引用的『原句』，不可改寫；若沒有合適句子可少於 ${limit}。`,
      },
      {
        role: "user",
        content: `請由以下原文挑選金句（以 JSON 陣列回覆）：\n${original.slice(0, 16000)}`,
      },
    ],
  });

  const raw = res.choices?.[0]?.message?.content?.trim() || "[]";
  // Unwrap an optional ``` / ```json fence around the payload.
  const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i);
  const payload = (fenced ? fenced[1] : raw).trim();

  let parsed;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Deliberate best-effort: quotes are optional, so a malformed reply is not fatal.
    return [];
  }
  if (!Array.isArray(parsed)) return [];

  // Keep only real strings so later rendering never prints "[object Object]".
  return parsed.filter((q) => typeof q === "string" && q.trim() !== "").slice(0, limit);
}

/**
 * Render the summary result as a Markdown report.
 * Sections whose list is empty (or not an array) show a placeholder line.
 *
 * @param {object} result - The assembled summary object.
 * @returns {string} Markdown text.
 */
function renderSummaryMarkdown(result) {
  // `[].map(...)` is truthy, so an `||` fallback never fires; test the length instead.
  const section = (items, format, emptyLine) => {
    const list = Array.isArray(items) ? items : [];
    return list.length > 0 ? list.map(format) : [emptyLine];
  };

  return [
    `# ${result.title}`,
    "",
    `- 產出時間：${result.createdAt}`,
    `- 字數：約 ${result.wordCount}`,
    `- 分塊數：${result.chunks}`,
    "",
    "## TL;DR",
    result.tldr || "(無)",
    "",
    "## Outline",
    ...section(result.outline, (o, i) => `${i + 1}. ${o}`, "(無)"),
    "",
    "## Key Points",
    ...section(result.keyPoints, (o) => `- ${o}`, "(無)"),
    "",
    "## Action Items",
    ...section(result.actionItems, (o) => `- [ ] ${o}`, "- (無)"),
    "",
    "## Questions",
    ...section(result.questions, (o) => `- ${o}`, "- (無)"),
    "",
    "## 引用金句",
    ...section(result.quotes, (q) => `> ${q}`, "> (無)"),
    "",
  ].join("\n");
}

/**
 * Main pipeline: read file -> chunk -> map (per-chunk summary) -> reduce -> write outputs.
 *
 * Writes `summary_<timestamp>.json` and `summary_<timestamp>.md` under
 * `outputs/docs`.
 *
 * @param {object} opts
 * @param {string} [opts.filePath] - Local file path; takes precedence over `url`.
 * @param {string} [opts.url] - Remote URL, downloaded only when `filePath` is not given.
 * @param {number} [opts.chunkSize=1200]
 * @param {number} [opts.overlap=120]
 * @param {("short"|"medium"|"long")} [opts.length="medium"]
 * @param {string} [opts.tone="professional"]
 * @returns {Promise<{jsonPath: string, mdPath: string, meta: object}>} Paths of
 *   the two written files plus the summary object itself.
 * @throws {Error} When neither `filePath` nor `url` is given, or the document
 *   text is empty after trimming.
 */
export async function summarizeDocument(opts = {}) {
  const {
    filePath,
    url,
    chunkSize = 1200,
    overlap = 120,
    length = "medium",
    tone = "professional",
  } = opts;

  // 1) Resolve a local path, downloading first when only a URL was given.
  if (!filePath && !url) throw new Error("請提供 filePath 或 url 其一。");
  const localPath = filePath || (await downloadToTemp(url));

  // 2) Read the document text.
  const fullText = (await readTextFile(localPath)).trim();
  if (!fullText) throw new Error("文件內容為空或無法解析。");

  // 3) Chunk and map; chunks are summarized one at a time, in order.
  const chunks = chunkText(fullText, chunkSize, overlap);
  const perChunkSummaries = [];
  for (const c of chunks) {
    perChunkSummaries.push(await summarizeChunk(c, { tone, length }));
  }

  // 4) Reduce the per-chunk summaries into one structured summary.
  const merged = await reduceSummaries(perChunkSummaries, { tone, length });

  // 5) Quotes are selected from the original text, not from the summaries.
  const quotes = await extractQuotes(fullText, 5);

  // 6) Assemble the result; model-provided lists are coerced to arrays so a
  //    malformed field cannot crash the Markdown rendering below.
  const asArray = (value) => (Array.isArray(value) ? value : []);
  const result = {
    title: path.basename(localPath),
    wordCount: fullText.length,
    chunks: chunks.length,
    tldr: merged?.tldr || "",
    outline: asArray(merged?.outline),
    keyPoints: asArray(merged?.keyPoints),
    actionItems: asArray(merged?.actionItems),
    questions: asArray(merged?.questions),
    quotes: asArray(quotes),
    createdAt: new Date().toISOString(),
  };

  // 7) Persist both representations under the same timestamp.
  const outDir = path.join("outputs", "docs");
  ensureDir(outDir);
  const stamp = Date.now();
  const jsonPath = path.join(outDir, `summary_${stamp}.json`);
  const mdPath = path.join(outDir, `summary_${stamp}.md`);

  writeJson(jsonPath, result);
  writeText(mdPath, renderSummaryMarkdown(result));

  return { jsonPath, mdPath, meta: result };
}

index.js（修改：加入 docsum 入口）
// index.js（只示範新增片段，保留你原有分支）
import { summarizeDocument } from "./src/day11_doc_summarizer.js";

// ...前略（既有 args 解析與其他 task）

/**
 * CLI entry point (excerpt): dispatches on `args.task`, defaulting to "chat".
 *
 * For the "docsum" task it builds the summarizer options from `args`, runs
 * `summarizeDocument`, and prints the output paths and the TL;DR.
 */
async function main() {
  const task = args.task || "chat";

  if (task === "docsum") {
    // A missing/falsy flag uses the fallback; anything else goes through Number().
    const numberOr = (raw, fallback) => (raw ? Number(raw) : fallback);

    const request = {
      filePath: args.filePath || null,
      url: args.url || null,
      length: args.length || "medium", // short | medium | long
      tone: args.tone || "professional", // friendly | professional
      chunkSize: numberOr(args.chunkSize, 1200),
      overlap: numberOr(args.overlap, 120),
    };

    const summary = await summarizeDocument(request);
    console.log("\n=== 文件摘要完成 ===");
    console.log("- JSON：", summary.jsonPath);
    console.log("- Markdown：", summary.mdPath);
    console.log("\nTL;DR：\n", summary.meta.tldr);

    // ...remaining task branches stay unchanged
  } else {
    // existing else branch omitted
  }
}

// Run the CLI; on any rejection print the error's message and exit with status 1.
main().catch((failure) => {
  const reason = failure.message;
  console.error("發生錯誤：", reason);
  process.exit(1);
});

package.json（新增 Scripts）
{
"scripts": {
"day11:txt": "node index.js --task docsum --filePath sample/article.md --length medium --tone professional",
"day11:pdf": "node index.js --task docsum --filePath sample/whitepaper.pdf --length short --tone friendly",
"day11:url": "node index.js --task docsum --url https://example.com/sample.pdf --length long --tone professional"
}
}

▶️ 如何執行（CLI）